#if (!require("BiocManager", quietly = TRUE))
 #   install.packages("BiocManager")

#BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")

loadlibs <- function(){
library(reshape)
library(pheatmap)
library(RColorBrewer)
library(DT)
library(ggrepel)
library(viridis)
#library(Repitools)
library(knitr)
library(DESeq2)
library(DiffBind)
library(ChIPpeakAnno)
library(ChIPseeker)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(AnnotationHub)
library(rtracklayer) 
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(tidyverse)  
library(ChIPseeker)
library(clusterProfiler)
library(rtracklayer)
library(ChIPpeakAnno)
}
suppressPackageStartupMessages(loadlibs())

knitr::opts_chunk$set(
    cache = FALSE,
    dev = c("png", "pdf"),
    error = TRUE,
    highlight = TRUE,
    message = FALSE,
    prompt = FALSE,
    tidy = FALSE,
    warning = FALSE,
    fig.align = 'center')
#getwd()
if (file.exists("./_setup.R")){
source("_setup.R")
}
# Set seed for reproducibility
set.seed(1454944673L)
viridis_col <- viridis_pal()(20)

1 Overview

  • Principal Investigator: Carla Kim
  • Researcher: Marghe
  • Experiment: BRG1 gene was knocked out in three celllines. The ATACseq experiemnt was performed on Brg1 KO and Brg1 WT This experiment analyzed ATACseq in three cells line where BRG1 was knocked out.
  • Experimental details:
  • Technical details:
    • 6 - samples
    • ATACseq
    • No replicates
## Load sample data
samples <- read.csv('../meta/Brg1_metadatasheet_NF.csv')
#head(samples)
# Load metadata
meta <- read.csv("../meta/Brg1_metadatasheet_NF_2.csv")
# Load counts 
counts_con <- read.delim("../data/consensus/consensus-counts.tsv", row.names=1)
#meta
sanitize_datatable(meta, style='bootstrap')
# Read in data
counts <- read.delim("../meta/multiBAMsummary.tab", sep="\t")

# Remove genomic coordinate information
plot_counts <- data.frame(counts[, 4:ncol(counts)])


# Change column names
colnames(plot_counts) <- colnames(plot_counts) %>% 
  str_replace( "X.", "") %>% 
  str_replace(".NF.", "") 

#all(meta$shortname %in% colnames(plot_counts))
plot_counts <- plot_counts[ ,meta$sample]


annotation <- as.data.frame(meta[,c("Phenotype", "Treatment")])

#all(meta$sample == colnames(plot_counts))

rownames(annotation) <- meta$sample
heat.colors <- brewer.pal(6, "YlOrRd")

#pheatmap(cor(plot_counts), color=heat.colors, annotation=annotation, fontsize = 8)
cat('\n\n')

1.1 Load Peakfiles

## loading packages

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene


NFnarrowPeak_tib <- meta

for (samplename in NFnarrowPeak_tib$SampleID) {
    peaks_file <-read.delim(paste0("../data/peakcalls/",samplename,"_peaks.narrowPeak"), header =F)

  df <- data.frame(peak_enrichment = peaks_file$V7, peak_rank = rank(dplyr::desc(peaks_file$V7))) %>% 
  dplyr::arrange(peak_rank)
  
  assign(paste0(samplename,"_peak_file"), df)
}
for (sample_ID in NFnarrowPeak_tib$SampleID) {
  obj <- ChIPpeakAnno::toGRanges(paste0("../data/peakcalls/",sample_ID,"_peaks.narrowPeak"), format="narrowPeak", header=FALSE)  
  assign(paste0(sample_ID,"_grange_file"), obj)
}

1.2 Cell line Analysis

1.3 H460

1.3.1 WT vs KO

  • Overlapping peaks: The Venn diagram gives us a representation of the number of peaks overlapping between WT and KO sample.
## check the genomic element distribution of the duplicates
## the genomic element distribution will indicates the 
## the correlation between duplicates.

peaks <- GRangesList(`S7_H460_WT-NF_grange_file`, `S9_H2009_WT-NF_grange_file`)
oh4 <- findOverlapsOfPeaks(`S7_H460_WT-NF_grange_file`, `S8_H460_KO-NF_grange_file`, connectedPeaks= "merge") 
#oh4$peaklist[["S7_H460_WT.NF_grange_file///S8_H460_KO.NF_grange_file"]][1:2]


makeVennDiagram(oh4, fill=c("#009E73", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2"), #circle border color
                cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color

## $p.value
##      S7_H460_WT.NF_grange_file S8_H460_KO.NF_grange_file pval
## [1,]                         1                         1    0
## 
## $vennCounts
##      S7_H460_WT.NF_grange_file S8_H460_KO.NF_grange_file Counts
## [1,]                         0                         0      0
## [2,]                         0                         1  37778
## [3,]                         1                         0  84807
## [4,]                         1                         1  71237
## attr(,"class")
## [1] "VennCounts"
cat('\n\n')

Getting the unique peaks from WT and KO and looking for annotations.

1.3.2 Unique WT peaks and Unique KO peaks

Below plots shows the annotation distribution for Unique WT peaks n= 84807 Unique KO peaks n= 37778

library(EnsDb.Hsapiens.v75) ##(hg19)
## create annotation file from EnsDb or TxDb
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
#annoData[1:2]

peaks <- GRangesList(H460_unique_WT=oh4$peaklist$S7_H460_WT.NF_grange_file,
                    H460_unique_KO=oh4$peaklist$S8_H460_KO.NF_grange_file)
genomicElementDistribution(peaks, 
                           TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene,
                           promoterRegion=c(upstream=2000, downstream=500),
                           geneDownstream=c(upstream=0, downstream=2000))

#metagenePlot(peaks, TxDb.Hsapiens.UCSC.hg19.knownGene)
cat('\n\n')

1.4 H2009

peaks9 <- GRangesList(`S9_H2009_WT-NF_grange_file`,`S10_H2009_KO-NF_grange_file`)
oh9 <- findOverlapsOfPeaks(`S9_H2009_WT-NF_grange_file`,`S10_H2009_KO-NF_grange_file`, connectedPeaks= "merge") 
#oh9$peaklist[["S9_H2009_WT.NF_grange_file///S10_H2009_KO.NF_grange_file"]][1:2]


makeVennDiagram(oh9, fill=c("#009E73", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2"), #circle border color
                cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color

## $p.value
##      S9_H2009_WT.NF_grange_file S10_H2009_KO.NF_grange_file pval
## [1,]                          1                           1    0
## 
## $vennCounts
##      S9_H2009_WT.NF_grange_file S10_H2009_KO.NF_grange_file Counts
## [1,]                          0                           0      0
## [2,]                          0                           1  68028
## [3,]                          1                           0  65486
## [4,]                          1                           1 102897
## attr(,"class")
## [1] "VennCounts"

H2009 sample seems to have similar peak calling however WT has more peaks than KO.

1.4.1 Unique WT peaks and Unique KO peaks

Below plots shows the annotation distribution for Unique WT peaks n= 65486 Unique KO peaks n= 37778

peaks9 <- GRangesList(H2009_unique_WT=oh9$peaklist$S9_H2009_WT.NF_grange_file,
                     H2009_unique_KO=oh9$peaklist$S10_H2009_KO.NF_grange_file)
genomicElementDistribution(peaks9, 
                           TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene,
                           promoterRegion=c(upstream=2000, downstream=500),
                           geneDownstream=c(upstream=0, downstream=2000))

cat('\n\n')

1.5 Calu6

peaksC <- GRangesList(`S11_Calu6_WT-NF_grange_file`,`S12_Calu6_KO-NF_grange_file`)
ohC <- findOverlapsOfPeaks(`S11_Calu6_WT-NF_grange_file`,`S12_Calu6_KO-NF_grange_file`, connectedPeaks= "merge") 
#ohC$peaklist[["S11_Calu6_WT-NF_grange_file///S12_Calu6_KO-NF_grange_file"]][1:2]


makeVennDiagram(ohC, fill=c("#009E73", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2"), #circle border color
                cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color

## $p.value
##      S11_Calu6_WT.NF_grange_file S12_Calu6_KO.NF_grange_file pval
## [1,]                           1                           1    0
## 
## $vennCounts
##      S11_Calu6_WT.NF_grange_file S12_Calu6_KO.NF_grange_file Counts
## [1,]                           0                           0      0
## [2,]                           0                           1  31687
## [3,]                           1                           0 126070
## [4,]                           1                           1  66548
## attr(,"class")
## [1] "VennCounts"

Similar pattern in Calu6 where WT has higher number of peaks called than KO.

1.5.1 Unique WT peaks and Unique KO peaks

Below plots shows the annotation distribution for Unique WT peaks n= 126070 Unique KO peaks n= 31687

peaksC <- GRangesList(Calu6_unique_WT=ohC$peaklist$S11_Calu6_WT.NF_grange_file,
                     Calu6_unique_KO=ohC$peaklist$S12_Calu6_KO.NF_grange_file)
genomicElementDistribution(peaksC, 
                           TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene,
                           promoterRegion=c(upstream=2000, downstream=500),
                           geneDownstream=c(upstream=0, downstream=2000))

cat('\n\n')

However peak distribution in Calu6 KO sample shows more Promter, and Distal Intergenic regions.

1.6 Consensus Peaks in three WT and KO samples

1.6.1 Finding consesus between Three cell-line samples

Peak Overlap Between WT samples I treated all three WT samples as replicates and attempted to call peaks after combining peaks from three samples.

#ol <- findOverlapsOfPeaks(`S7_H460_WT-NF_grange_file`, `S9_H2009_WT-NF_grange_file`, `S11_Calu6_WT-NF_grange_file`, connectedPeaks="keepAll")

# WT Brg1
olaps_wt <- findOverlapsOfPeaks(`S7_H460_WT-NF_grange_file`, `S9_H2009_WT-NF_grange_file`, `S11_Calu6_WT-NF_grange_file`, connectedPeaks="merge")

ven<-makeVennDiagram(olaps_wt, totalTest=3e+3, connectedPeaks = "merge",
                fill = c("cornflowerblue", "orange", "brown1"), NameOfPeaks = c("H460_WT", "H2009_WT", "Calu6_WT"))

overlaps_wt <- olaps_wt$peaklist$`S7_H460_WT.NF_grange_file///S9_H2009_WT.NF_grange_file///S11_Calu6_WT.NF_grange_file`
## add metadata (mean of score) to the overlapping peaks
#ol <- addMetadata(ol, colNames="score", FUN=mean) 
cat('\n\n')

Peak Overlap Between KO samples

#ok <- findOverlapsOfPeaks(`S8_H460_KO-NF_grange_file`, `S10_H2009_KO-NF_grange_file`, `S12_Calu6_KO-NF_grange_file`, connectedPeaks="keepAll")

# WT Brg1
olaps_ko <- findOverlapsOfPeaks(`S8_H460_KO-NF_grange_file`, `S10_H2009_KO-NF_grange_file`, `S12_Calu6_KO-NF_grange_file`, connectedPeaks="merge")

ven<-makeVennDiagram(olaps_ko, totalTest=3e+3, connectedPeaks = "merge",
                fill = c("cornflowerblue", "orange", "brown1"), NameOfPeaks = c("H460_KO", "H2009_KO", "Calu6_KO"))

overlaps_ko <- olaps_ko$peaklist$`S8_H460_KO.NF_grange_file///S10_H2009_KO.NF_grange_file///S12_Calu6_KO.NF_grange_file`
cat('\n\n')

There are rlength(olaps_ko\(peaklist\)S8_H460_KO.NF_grange_file///S10_H2009_KO.NF_grange_file///S12_Calu6_KO.NF_grange_file)` regions common in KO samples.

1.6.2 Obtain the WT vs KO analysis from consensus regions from cell lines

#WT_cons vs KO_cons
olaps_final <- findOverlapsOfPeaks(overlaps_wt,overlaps_ko,connectedPeaks="merge")

ven<-makeVennDiagram(olaps_final, connectedPeaks = "merge")

olaps_cons_wt <- olaps_final$peaklist$overlaps_wt

olaps_cons_ko <- olaps_final$peaklist$overlaps_ko

peaks_con_list <- GRangesList(WT=olaps_cons_wt, KO=olaps_cons_ko)
cat('\n\n')

It looks like there are 18995 unique in WT samples while 7630 unique to KO samples.

1.6.3 WT and KO peak features comparision

If no precedence specified, double count will be enabled, which means that if a peak overlap with both promoter and 5’UTR, both promoter and 5’UTR will be incremented. If a precedence order is specified, for example, if promoter is specified before 5’UTR, then only promoter will be incremented for the same example. The values could be any conbinations of “Promoters”, “immediateDownstream”, “fiveUTRs”, “threeUTRs”, “Exons” and “Introns”, Default=NULL

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(patchwork)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature="gene")
#annoData
data(TSS.human.NCBI36)
##WT
annotatedPeak_wt <- annotatePeakInBatch(olaps_cons_wt, 
                 AnnotationData=TSS.human.NCBI36)
#annotatedPeak_wt[1:3]

#pie1(table(as.data.frame(annotatedPeak_wt)$insideFeature)) 
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
    aCR<-assignChromosomeRegion(annotatedPeak_wt, nucleotideLevel=FALSE, 
                            precedence=c("Promoters", "immediateDownstream", 
                                          "fiveUTRs", "threeUTRs", 
                                          "Exons", "Introns"),
                           TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
    barplot(aCR$percentage) 
    title(main = "WT plot", font = 4)
}

annotatedPeak_wt <- addGeneIDs(annotatedPeak_wt, orgAnn="org.Hs.eg.db", 
                   feature_id_type="ensembl_gene_id",
                   IDs2Add=c("symbol"))
#head(annotatedPeak_wt)
#head(overlaps.anno)
write.csv(as.data.frame(unname(annotatedPeak_wt)), "WT_consensus_anno.csv")

##KO
annotatedPeak_ko <- annotatePeakInBatch(olaps_cons_ko, 
                 AnnotationData=TSS.human.NCBI36)
#annotatedPeak_wt[1:3]
#pie1(table(as.data.frame(annotatedPeak_ko)$insideFeature)) 
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
    aCR<-assignChromosomeRegion(annotatedPeak_ko, nucleotideLevel=FALSE, 
                            precedence=c("Promoters", "immediateDownstream", 
                                          "fiveUTRs", "threeUTRs", 
                                          "Exons", "Introns"),
                           TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
    barplot(aCR$percentage) 
    title(main = "KO plot", font = 4)
}

annotatedPeak_ko <- addGeneIDs(annotatedPeak_ko, orgAnn="org.Hs.eg.db", 
                   feature_id_type="ensembl_gene_id",
                   IDs2Add=c("symbol"))
#head(annotatedPeak_ko)

write.csv(as.data.frame(unname(annotatedPeak_ko)), "KO_consensus_anno.csv")

The label missing the barplot is “immediateDownstream”, due to size it’s not able to fit in xlabel text window.

1.6.4 Pathway analysis

library(GO.db)
library(patchwork)
library(org.Hs.eg.db)
library( reactome.db )

wt <- getEnrichedPATH(annotatedPeak_wt, "org.Hs.eg.db", "reactome.db", maxP=.05)
wt <- enrichmentPlot(wt) + ggtitle('WT')
wt

#KO
ko <- getEnrichedPATH(annotatedPeak_ko, "org.Hs.eg.db", "reactome.db", maxP=.05)
ko <- enrichmentPlot(ko) + ggtitle('KO')


ko

cat('\n\n')

1.7 DiffBind Analysis

we need to see how samples might be segregating and see if the variability observed can be explained by any other covariates. Identifying the main sources of variation will allow us to better model the data, and also validate outlier samples.

1.7.1 PCA

# Load counts 
samples <- read.csv("../meta/metadata_rm.csv")
dObj <- dba(sampleSheet=samples)
dObj <- dba.count(dObj)
info <- dba.show(dObj)
libsizes <- cbind(LibReads=info$Reads, FRiP=info$FRiP,PeakReads=round(info$Reads * info$FRiP))
rownames(libsizes) <- info$ID

#Normalizing the data
dObj <- dba.normalize(dObj)
norm <- dba.normalize(dObj, bRetrieve=TRUE)
normlibs <- cbind(FullLibSize=norm$lib.sizes, NormFacs=norm$norm.factors, NormLibSize=round(norm$lib.sizes/norm$norm.factors))
rownames(normlibs) <- info$ID


#Establishing a model design and contrast

dObj <- dba.contrast(dObj, categories=DBA_TREATMENT)
dba.plotPCA(dObj)
#Performing the differential analysis
#dObj <- dba.analyze(dObj)
dObj <- dba.analyze(dObj, method=DBA_ALL_METHODS)
#Retrieving the differentially bound sites
dObj.DB <- dba.report(dObj)
#print("no genes found with fold change more than 0")
#sum(dObj.DB$Fold>0)
#print("genes found to have fold change less than 0")
#sum(dObj.DB$Fold<0)
meta_H4 <- read.csv("../meta/metadata_rm.csv")

meta_H4 <- meta_H4 %>% 
  column_to_rownames(var="sample")
counts_H4 <- counts_con[,rownames(meta_H4)]

#all 
dds_all <- DESeqDataSetFromMatrix(counts_H4, meta_H4, design=~cellline + Factor)
dds_all <- DESeq(dds_all)

cat('\n\n')
#meta

From the PCA we observe a samples from same celllines are close which associates with the sample groups.

# Matrix of transformed counts for downstream visualization
rld1 <- rlog(dds_all, blind = TRUE)

# Compute principal components
pc <- prcomp(t(assay(rld1)))
plot_pca <- data.frame(pc$x, colData(dds_all))

# Plot with sample names used as data points
ggplot(plot_pca, aes(x = PC1, y = PC2, label = rownames(plot_pca))) + 
  geom_point(aes(color = cellline,shape = Factor), size = 3) +
  geom_text_repel() +
  xlab('PC1') + ylab('PC2') +
  theme_bw() +
  theme(axis.title = element_text(size = rel(1.25)),
        axis.text = element_text(size = rel(1.15)),
        plot.title = element_text(size = rel(1.5), hjust = 0.5),
        legend.title = element_blank()) +
  scale_color_manual(values=viridis_col[c(1,20)])

cat('\n\n')
cat('\n\n')

1.8 Differentially expressed genes after removing H2009 cellline

We do not have replicates in this study, so we may not be able to get differential expressed genes from two conditions i.e. WT and KO for each cell line. However, since we have Brg1 KO and WT samples from three cell lines,i.e. H460, H2009 and Calu6 we may try finding DE genes bt treatind each condition sample as replicates ie. WT samples vs KO samples.

1.8.1 Dispersion plot

We often look at the dispersion plot to get a good idea of whether or not our data is a good fit for the model. Dispersion is a metric for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene and is used in the final GLM fit. From this plot we see that:

  • There is an expected inverse relationship between dispersion and mean expression
  • There is very little shrinkage of values, indicating that the initial estimates are pretty accurate (because of the larger sample sizes per group)
plotDispEsts(dds_all)

# filter
keep <- rowSums(counts(dds_all) >= 10) >= 3
dds_all <- dds_all[keep,]
dds_all <- DESeq(dds_all)

# plot again
plotDispEsts(dds_all)

cat('\n\n')

1.8.2 MA plot

The MA plot explores the mean expression level of the genes with the fold change, highlighting the regions that are differentially enriched (padj < 0.05) using blue colored data points.

# Get results
res_all <- results(dds_all,contrast=c("Factor","WT","KO"))
resultsNames(dds_all)
## [1] "Intercept"              "cellline_H460_vs_Calu6" "Factor_WT_vs_KO"
res_all <- lfcShrink(dds_all, coef="Factor_WT_vs_KO", type="apeglm")

plotMA(res_all, alpha = 0.5)

#res_all <- sort(res_all@listData[["padj"]])
cat('\n\n')

1.8.3 Volcano plots

At a padj < 0.05, we find:

  • There are 284 regions that are significantly changing between WT and KO .

Here, we plot the log2 fold change of each region against the log10 adjusted p-value. The points highlighted in teal are regions that have padj < 0.05.

The contrast is setup such that positive logFC means that there is an increased enrichment of a region in WT compared to KO.

nrow(res_all[which(res_all$padj < 0.05),])
## [1] 284
# Create dataframe
res_all <- res_all %>% 
   as.data.frame() %>% 
  mutate(threshold = padj < 0.05) %>% 
  arrange(padj)

res_all_filt <- res_all %>% 
   as.data.frame() %>% 
  mutate(threshold = padj < 0.05) %>% 
  arrange(padj) %>% filter(threshold == "TRUE") 

## Volcano plot
p1 <- ggplot(res_all) +
  geom_point(aes(log2FoldChange, y = -log10(padj), colour = threshold)) +
  xlab("log2 fold change") +
  ylab("-log10 adjusted p-value") +
  ggtitle("WT vs KO") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = rel(2.0), hjust = 0.5),
    axis.title = element_text(size = rel(1.25))
  )



p1

cat('\n\n')

1.8.4 Annotation Distribution

library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene #shorthand (for convenience)


# Make TxDb

#txdb
# Create  GRanges
gr_all <- data.frame(rows = rownames(res_all_filt)) %>% 
  separate(rows, c("chr", "start", "end")) %>% 
  cbind(res_all_filt) %>% 
  remove_rownames() %>% 
  makeGRangesFromDataFrame(keep.extra.columns=TRUE,
                         ignore.strand=TRUE,
                         seqnames.field=c("chr"),
                         start.field="start",
                         end.field=c("end"))



# Annotate
WT_KO <- annotatePeak(gr_all, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Hs.eg.db")
## >> preparing features information...      2022-04-12 21:14:33 
## >> identifying nearest features...        2022-04-12 21:14:34 
## >> calculating distance from peak to TSS...   2022-04-12 21:14:34 
## >> assigning genomic annotation...        2022-04-12 21:14:34 
## >> adding gene annotation...          2022-04-12 21:14:45 
## >> assigning chromosome lengths           2022-04-12 21:14:45 
## >> done...                    2022-04-12 21:14:45
plotAnnoPie(WT_KO)

#plotAnnoBar(WT_KO)
#upsetplot(WT_KO)
#vennpie(WT_KO)
library(ggupset)
library(ggimage)
#upsetplot(WT_KO, vennpie=TRUE)


# Query AnnotationHub to get gene symbols
ah <- AnnotationHub()

# Query AnnotationHub
human_ens <- query(ah, c("Homo sapiens", "EnsDb"))
# Extract annotations of interest
human_ens <- human_ens[["AH75011"]]

# Extract gene-level information
#genes(human_ens, return.type = "data.frame") %>% head()


# Create a gene-level dataframe 
annotations_ahb <- genes(human_ens, return.type = "data.frame")  %>%
  dplyr::select(gene_name,gene_id, entrezid, gene_biotype,symbol)


# Keep one entrez Id per gene
#annotations_ahb$entrezid <- map(annotations_ahb$entrezid,1) %>%  unlist()

# Combine annotations with results
WT_KO_all <- WT_KO@anno %>%
  data.frame() %>%
  dplyr::select(-geneChr, -geneStart, -geneEnd, -geneLength, -geneStrand) %>% 
  left_join(annotations_ahb, by=c("geneId" = "gene_id"))

#WT_KO_all
plotDistToTSS(WT_KO,
              title="Distribution of transcription factor-binding loci\nrelative to TSS")

cat('\n\n')

1.9 Table of significant DE regions

Below we have separated significant peaks by direction of change and report a few of the statistic columns.

1.9.1 Open regions in samples

There is only one region found which is significantly different in WT vs KO.

library(tidyverse)

sigUp <- WT_KO_all %>% 
  dplyr::filter(threshold == "TRUE" & log2FoldChange > 0) %>% 
  arrange(padj)  %>% 
  mutate_if(is.numeric, round, digits = 7)

sanitize_datatable(sigUp[,c("seqnames", "start", "end", "SYMBOL" , "annotation", "log2FoldChange", "pvalue", "padj","gene_name")])
cat('\n\n')

1.10 Differentially expressed genes after removing H460 cellline

I removed the H460 cell-line from the set.

meta_H9 <- read.csv("../meta/metadata_rm2.csv")

meta_H9 <- meta_H9 %>% 
  column_to_rownames(var="sample")
counts_H9 <- counts_con[,rownames(meta_H9)]

#all 
dds_all <- DESeqDataSetFromMatrix(counts_H9, meta_H9, design=~cellline + Factor)
dds_all <- DESeq(dds_all)

cat('\n\n')
#meta

From the PCA we observe a samples from same celllines are close which associates with the sample groups.

# Matrix of transformed counts for downstream visualization
rld1 <- rlog(dds_all, blind = TRUE)

# Compute principal components
pc <- prcomp(t(assay(rld1)))
plot_pca <- data.frame(pc$x, colData(dds_all))

# Plot with sample names used as data points
ggplot(plot_pca, aes(x = PC1, y = PC2, label = rownames(plot_pca))) + 
  geom_point(aes(color = cellline,shape = Factor), size = 3) +
  geom_text_repel() +
  xlab('PC1') + ylab('PC2') +
  theme_bw() +
  theme(axis.title = element_text(size = rel(1.25)),
        axis.text = element_text(size = rel(1.15)),
        plot.title = element_text(size = rel(1.5), hjust = 0.5),
        legend.title = element_blank()) +
  scale_color_manual(values=viridis_col[c(1,20)])

cat('\n\n')
cat('\n\n')

1.10.1 Dispersion plot

plotDispEsts(dds_all)

# filter
keep <- rowSums(counts(dds_all) >= 10) >= 3
dds_all <- dds_all[keep,]
dds_all <- DESeq(dds_all)

# plot again
plotDispEsts(dds_all)

cat('\n\n')

1.10.2 MA plot

The MA plot explores the mean expression level of the genes with the fold change, highlighting the regions that are differentially enriched (padj < 0.05) using blue colored data points.

# Get results
res_all <- results(dds_all,contrast=c("Factor","WT","KO"))
resultsNames(dds_all)
## [1] "Intercept"               "cellline_H2009_vs_Calu6"
## [3] "Factor_WT_vs_KO"
res_all <- lfcShrink(dds_all, coef="Factor_WT_vs_KO", type="apeglm")

plotMA(res_all, alpha = 0.5)

#res_all <- sort(res_all@listData[["padj"]])
cat('\n\n')

1.10.3 Volcano plots

At a padj < 0.05, we find:

  • There are 2 regions that are significantly changing between WT and KO .

Here, we plot the log2 fold change of each region against the log10 adjusted p-value. The points highlighted in teal are regions that have padj < 0.05.

The contrast is setup such that positive logFC means that there is an increased enrichment of a region in WT compared to KO.

nrow(res_all[which(res_all$padj < 0.05),])
## [1] 2
# Create dataframe
res_all <- res_all %>% 
   as.data.frame() %>% 
  mutate(threshold = padj < 0.05) %>% 
  arrange(padj)

res_all_filt <- res_all %>% 
   as.data.frame() %>% 
  mutate(threshold = padj < 0.05) %>% 
  arrange(padj) %>% filter(threshold == "TRUE") 

## Volcano plot
p1 <- ggplot(res_all) +
  geom_point(aes(log2FoldChange, y = -log10(padj), colour = threshold)) +
  xlab("log2 fold change") +
  ylab("-log10 adjusted p-value") +
  ggtitle("WT vs KO") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = rel(2.0), hjust = 0.5),
    axis.title = element_text(size = rel(1.25))
  )



p1

cat('\n\n')

1.10.4 Annotation Distribution

library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene #shorthand (for convenience)


# Make TxDb

#txdb
# Create  GRanges
gr_all <- data.frame(rows = rownames(res_all_filt)) %>% 
  separate(rows, c("chr", "start", "end")) %>% 
  cbind(res_all_filt) %>% 
  remove_rownames() %>% 
  makeGRangesFromDataFrame(keep.extra.columns=TRUE,
                         ignore.strand=TRUE,
                         seqnames.field=c("chr"),
                         start.field="start",
                         end.field=c("end"))



# Annotate
WT_KO <- annotatePeak(gr_all, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Hs.eg.db")
## >> preparing features information...      2022-04-12 21:18:11 
## >> identifying nearest features...        2022-04-12 21:18:11 
## >> calculating distance from peak to TSS...   2022-04-12 21:18:11 
## >> assigning genomic annotation...        2022-04-12 21:18:11 
## >> adding gene annotation...          2022-04-12 21:18:13 
## >> assigning chromosome lengths           2022-04-12 21:18:13 
## >> done...                    2022-04-12 21:18:13
plotAnnoPie(WT_KO)

#plotAnnoBar(WT_KO)
#upsetplot(WT_KO)
#vennpie(WT_KO)
library(ggupset)
library(ggimage)
#upsetplot(WT_KO, vennpie=TRUE)


# Query AnnotationHub to get gene symbols
ah <- AnnotationHub()

# Query AnnotationHub
human_ens <- query(ah, c("Homo sapiens", "EnsDb"))
# Extract annotations of interest
human_ens <- human_ens[["AH75011"]]

# Extract gene-level information
#genes(human_ens, return.type = "data.frame") %>% head()


# Create a gene-level dataframe 
annotations_ahb <- genes(human_ens, return.type = "data.frame")  %>%
  dplyr::select(gene_name,gene_id, entrezid, gene_biotype,symbol)


# Keep one entrez Id per gene
#annotations_ahb$entrezid <- map(annotations_ahb$entrezid,1) %>%  unlist()

# Combine annotations with results
WT_KO_all <- WT_KO@anno %>%
  data.frame() %>%
  dplyr::select(-geneChr, -geneStart, -geneEnd, -geneLength, -geneStrand) %>% 
  left_join(annotations_ahb, by=c("geneId" = "gene_id"))

#WT_KO_all
plotDistToTSS(WT_KO,
              title="Distribution of transcription factor-binding loci\nrelative to TSS")
## Error in seq.default(-uslim, dslim, by = 10): 'from' must be a finite number
cat('\n\n')

1.11 Table of significant DE regions

Below we have separated significant peaks by direction of change and report a few of the statistic columns.

1.11.1 Open regions in samples

There is only one region found which is significantly different in WT vs KO.

library(tidyverse)

sigUp <- WT_KO_all %>% 
  dplyr::filter(threshold == "TRUE" & log2FoldChange > 0) %>% 
  arrange(padj)  %>% 
  mutate_if(is.numeric, round, digits = 7)

sanitize_datatable(sigUp[,c("seqnames", "start", "end", "SYMBOL" , "annotation", "log2FoldChange", "pvalue", "padj","gene_name")])
cat('\n\n')

1.12 R Session

Below is the output of the R session used to generate this report. Included is information on R version, OS, and versions of packages installed and used.

sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggimage_0.3.0                          
##  [2] ggupset_0.3.0                          
##  [3] reactome.db_1.77.0                     
##  [4] GO.db_3.14.0                           
##  [5] patchwork_1.1.1                        
##  [6] org.Hs.eg.db_3.14.0                    
##  [7] EnsDb.Hsapiens.v75_2.99.0              
##  [8] clusterProfiler_4.2.2                  
##  [9] forcats_0.5.1                          
## [10] stringr_1.4.0                          
## [11] dplyr_1.0.8                            
## [12] purrr_0.3.4                            
## [13] readr_2.1.2                            
## [14] tidyr_1.2.0                            
## [15] tibble_3.1.6                           
## [16] tidyverse_1.3.1                        
## [17] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [18] rtracklayer_1.54.0                     
## [19] AnnotationHub_3.2.2                    
## [20] BiocFileCache_2.2.1                    
## [21] dbplyr_2.1.1                           
## [22] EnsDb.Hsapiens.v86_2.99.0              
## [23] ensembldb_2.18.4                       
## [24] AnnotationFilter_1.18.0                
## [25] GenomicFeatures_1.46.5                 
## [26] AnnotationDbi_1.56.2                   
## [27] ChIPseeker_1.30.3                      
## [28] ChIPpeakAnno_3.28.1                    
## [29] DiffBind_3.4.11                        
## [30] DESeq2_1.34.0                          
## [31] SummarizedExperiment_1.24.0            
## [32] Biobase_2.54.0                         
## [33] MatrixGenerics_1.6.0                   
## [34] matrixStats_0.61.0                     
## [35] GenomicRanges_1.46.1                   
## [36] GenomeInfoDb_1.30.1                    
## [37] IRanges_2.28.0                         
## [38] S4Vectors_0.32.4                       
## [39] BiocGenerics_0.40.0                    
## [40] knitr_1.38                             
## [41] viridis_0.6.2                          
## [42] viridisLite_0.4.0                      
## [43] ggrepel_0.9.1                          
## [44] ggplot2_3.3.5                          
## [45] DT_0.22                                
## [46] RColorBrewer_1.1-3                     
## [47] pheatmap_1.0.12                        
## [48] reshape_0.8.9                          
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3                coda_0.19-4                  
##   [3] bit64_4.0.5                   irlba_2.3.5                  
##   [5] DelayedArray_0.20.0           data.table_1.14.2            
##   [7] hwriter_1.3.2.1               KEGGREST_1.34.0              
##   [9] RCurl_1.98-1.6                generics_0.1.2               
##  [11] lambda.r_1.2.4                RSQLite_2.2.12               
##  [13] shadowtext_0.1.1              tzdb_0.3.0                   
##  [15] bit_4.0.4                     enrichplot_1.14.2            
##  [17] xml2_1.3.3                    lubridate_1.8.0              
##  [19] httpuv_1.6.5                  assertthat_0.2.1             
##  [21] amap_0.8-18                   apeglm_1.16.0                
##  [23] xfun_0.30                     hms_1.1.1                    
##  [25] jquerylib_0.1.4               evaluate_0.15                
##  [27] promises_1.2.0.1              fansi_1.0.3                  
##  [29] restfulr_0.0.13               progress_1.2.2               
##  [31] readxl_1.4.0                  caTools_1.18.2               
##  [33] igraph_1.3.0                  DBI_1.1.2                    
##  [35] geneplotter_1.72.0            htmlwidgets_1.5.4            
##  [37] futile.logger_1.4.3           ellipsis_0.3.2               
##  [39] crosstalk_1.2.0               backports_1.4.1              
##  [41] annotate_1.72.0               biomaRt_2.50.3               
##  [43] vctrs_0.4.0                   cachem_1.0.6                 
##  [45] withr_2.5.0                   ggforce_0.3.3                
##  [47] BSgenome_1.62.0               bdsmatrix_1.3-4              
##  [49] GenomicAlignments_1.30.0      treeio_1.18.1                
##  [51] prettyunits_1.1.1             DOSE_3.20.1                  
##  [53] ape_5.6-2                     lazyeval_0.2.2               
##  [55] crayon_1.5.1                  genefilter_1.76.0            
##  [57] labeling_0.4.2                pkgconfig_2.0.3              
##  [59] tweenr_1.0.2                  nlme_3.1-157                 
##  [61] ProtGenerics_1.26.0           rlang_1.0.2                  
##  [63] lifecycle_1.0.1               downloader_0.4               
##  [65] filelock_1.0.2                modelr_0.1.8                 
##  [67] VennDiagram_1.7.1             invgamma_1.1                 
##  [69] cellranger_1.1.0              polyclip_1.10-0              
##  [71] graph_1.72.0                  Matrix_1.4-1                 
##  [73] aplot_0.1.3                   ashr_2.2-54                  
##  [75] boot_1.3-28                   reprex_2.0.1                 
##  [77] png_0.1-7                     rjson_0.2.21                 
##  [79] bitops_1.0-7                  KernSmooth_2.23-20           
##  [81] Biostrings_2.62.0             blob_1.2.3                   
##  [83] mixsqp_0.3-43                 SQUAREM_2021.1               
##  [85] qvalue_2.26.0                 regioneR_1.26.1              
##  [87] ShortRead_1.52.0              jpeg_0.1-9                   
##  [89] gridGraphics_0.5-1            scales_1.1.1                 
##  [91] memoise_2.0.1                 magrittr_2.0.3               
##  [93] plyr_1.8.7                    gplots_3.1.1                 
##  [95] zlibbioc_1.40.0               compiler_4.1.3               
##  [97] scatterpie_0.1.7              BiocIO_1.4.0                 
##  [99] bbmle_1.0.24                  plotrix_3.8-2                
## [101] Rsamtools_2.10.0              cli_3.2.0                    
## [103] systemPipeR_2.0.6             XVector_0.34.0               
## [105] formatR_1.12                  MASS_7.3-56                  
## [107] tidyselect_1.1.2              stringi_1.7.6                
## [109] highr_0.9                     emdbook_1.3.12               
## [111] yaml_2.3.5                    GOSemSim_2.20.0              
## [113] locfit_1.5-9.5                latticeExtra_0.6-29          
## [115] grid_4.1.3                    sass_0.4.1                   
## [117] fastmatch_1.1-3               tools_4.1.3                  
## [119] parallel_4.1.3                rstudioapi_0.13              
## [121] gridExtra_2.3                 farver_2.1.0                 
## [123] ggraph_2.0.5                  digest_0.6.29                
## [125] BiocManager_1.30.16           shiny_1.7.1                  
## [127] Rcpp_1.0.8.3                  broom_0.7.12                 
## [129] BiocVersion_3.14.0            later_1.3.0                  
## [131] httr_1.4.2                    colorspace_2.0-3             
## [133] rvest_1.0.2                   fs_1.5.2                     
## [135] XML_3.99-0.9                  truncnorm_1.0-8              
## [137] splines_4.1.3                 yulab.utils_0.0.4            
## [139] RBGL_1.70.0                   tidytree_0.3.9               
## [141] graphlayouts_0.8.0            multtest_2.50.0              
## [143] ggplotify_0.1.0               xtable_1.8-4                 
## [145] jsonlite_1.8.0                ggtree_3.2.1                 
## [147] futile.options_1.0.1          tidygraph_1.2.1              
## [149] ggfun_0.0.6                   R6_2.5.1                     
## [151] pillar_1.7.0                  htmltools_0.5.2              
## [153] mime_0.12                     glue_1.6.2                   
## [155] fastmap_1.1.0                 BiocParallel_1.28.3          
## [157] interactiveDisplayBase_1.32.0 fgsea_1.20.0                 
## [159] GreyListChIP_1.26.0           mvtnorm_1.1-3                
## [161] utf8_1.2.2                    lattice_0.20-45              
## [163] bslib_0.3.1                   numDeriv_2016.8-1.1          
## [165] curl_4.3.2                    gtools_3.9.2                 
## [167] magick_2.7.3                  survival_3.3-1               
## [169] limma_3.50.3                  rmarkdown_2.13               
## [171] InteractionSet_1.22.0         munsell_0.5.0                
## [173] DO.db_2.9                     GenomeInfoDbData_1.2.7       
## [175] haven_2.4.3                   reshape2_1.4.4               
## [177] gtable_0.3.0